Adamson smallRNA - pegRNA

Author

Lance Parsons

Load libraries

This project uses renv to keep track of installed packages. Install renv if not installed and load dependencies with renv::restore().

install.packages("renv")
renv::restore()

Read data

  1. Get list of samples
Code
samples <- read_tsv("config/samples.tsv", show_col_types = FALSE)
units <- read_tsv("config/units.tsv", show_col_types = FALSE)
sample_units <- dplyr::left_join(samples, units, by = "sample_name") %>%
  unite(sample_unit, sample_name, unit_name, remove = FALSE)
sample_units
  1. Read Samtools idxstats to get human coverage for normalization

Notes:

  • The counts include the total number of reads aligned, they are not limited to uniquely aligned reads.
  • The counts are reads, not pairs or fragments
Code
idxstats_exogenousrna_dir <-
  "results/samtools_idxstats/exogenous_rna/"

idxstats_human_dir <-
  "results/samtools_idxstats/Homo_sapiens.GRCh38.dna.primary_assembly/"

bowtie2_human_logs <-
  "results/logs/bowtie2/Homo_sapiens.GRCh38.dna.primary_assembly/"

idxstats <- tibble()

for (row in seq_len(nrow(sample_units))) {
  sample <- sample_units[row, ]$sample_unit

  # Read `idsxstats` for exogenous mapped reads
  exogenous_rna_stats <- read_tsv(
    file.path(idxstats_exogenousrna_dir, sprintf("%s.bam.idxstats", sample)),
    col_names = c(
      "sequence_name", "sequence_length",
      "mapped_reads", "unmapped_reads"
    ),
    col_types = "ciii"
  )
  exogenous_rna_mapped_reads <- exogenous_rna_stats %>%
    filter(!sequence_name %in% c("*")) %>%
    select(sequence_name, mapped_reads) %>%
    mutate(sample = sample)

  # Read `idxstats` for human mapped reads
  human_stats <- read_tsv(
    file.path(idxstats_human_dir, sprintf("%s.bam.idxstats", sample)),
    col_names = c(
      "sequence_name", "sequence_length",
      "mapped_reads", "unmapped_reads"
    ),
    col_types = "ciii"
  )
  grch38_mapped_reads <- human_stats %>%
    filter(!sequence_name %in% c("*")) %>%
    select(mapped_reads) %>%
    sum()
  grch38_mapped_reads <- tibble(
    sequence_name = "grch38_mapped_reads",
    mapped_reads = grch38_mapped_reads,
    sample = sample
  )

  # Read bowtie2 logs for unmapped reads
  bowtie2_log <- readLines(
    file.path(bowtie2_human_logs, sprintf("%s.log", sample))
  )
  total_pairs <- strtoi(str_split(bowtie2_log[1], " ")[[1]][1])
  total_reads <- total_pairs * 2
  unmapped_reads <- tibble(
    sequence_name = "unmapped",
    mapped_reads = total_reads - grch38_mapped_reads$mapped_reads,
    sample = sample
  )

  # Consolidate counts for rows
  idxstats <- rbind(
    idxstats,
    exogenous_rna_mapped_reads,
    grch38_mapped_reads,
    unmapped_reads
  )
}
idxstats

Coverage

Concordant vs Discordant paired reads

Concordant pairs are pairs of reads that:

  • Align on the same pegRNA
  • Align within 500 bp of each other
  • Align in the expected forward-reverse orientation (--> .. <--)

Discordant reads aligned but whose mate:

  • Did not align (on the pegRNA)
  • Aligned more than 500 bp away
  • Aligned in an unexpected orientation
Code
## Config and function definition

bam_dir <- "results/alignments/exogenous_rna/sorted"

last_day <- 0
cols <- brewer.pal(n = 5, name = "RdBu")

concordant_cell_line_colors <- list(
  "Parental" = "#CA0020",
  "P1E10" = "#0571B0"
)

discordant_cell_line_colors <- list(
  "Parental" = "#F4A582",
  "P1E10" = "#92C5DE"
)

# Exogenous RNA mixtures
rna_mixes <- tibble()
for (mix in c("mastermix1", "mastermix2", "PJY103", "PJY300")) {
  t <- readDNAStringSet(sprintf("data/references/%s.fa", mix))
  rna_mixes <- rbind(rna_mixes, tibble(
    exogenous_rna = mix,
    rna_species = word(t@ranges@NAMES, 1),
    start = 1,
    end = t@ranges@width
  ))
}
rna_mixes <- rna_mixes %>%
  rows_update(tibble(exogenous_rna = "PJY103", start = 331, end = 460),
    by = "exogenous_rna"
  ) %>%
  rows_update(tibble(exogenous_rna = "PJY300", start = 331, end = 497),
    by = "exogenous_rna"
  )
rna_mixes
Code
source("pegrna_plots.R")

PJY103 / PJY300 (Batch 2)

Code
sample_list <- rna_mixes %>% filter(grepl("PJY103|PJY300", rna_species))

for (start_offset in c(NA, 5)) {
  for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
    for (i in seq_len(nrow(sample_list))) {
      rna_species <- sample_list[[i, "rna_species"]]
      mix <- sample_list[[i, "exogenous_rna"]]

      pegrna_plots(
        sequence_name = rna_species,
        normalization_factor = norm_factor,
        mix = mix,
        ylab = sprintf("%s coverage (normalized to %s)", mix, norm_factor),
        vlines = seq(sample_list[i, ]$start, sample_list[i, ]$end, 2),
        start_offset = start_offset
      )
    }
  }
}

VEGFA

Code
sample_list <- rna_mixes %>% filter(grepl("VEGFA", rna_species))

for (start_offset in c(NA, 5)) {
  for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
    for (i in seq_len(nrow(sample_list))) {
      rna_species <- sample_list[[i, "rna_species"]]
      mix <- sample_list[[i, "exogenous_rna"]]

      pegrna_plots(
        sequence_name = rna_species,
        normalization_factor = norm_factor,
        mix = mix,
        ylab = sprintf("%s coverage (normalized to %s)", mix, norm_factor),
        vlines = seq(sample_list[i, ]$start, sample_list[i, ]$end, 2),
        start_offset = start_offset
      )
    }
  }
}

FANCF

Code
sample_list <- rna_mixes %>% filter(grepl("FANCF", rna_species))

for (start_offset in c(NA, 5)) {
  for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
    for (i in seq_len(nrow(sample_list))) {
      rna_species <- sample_list[i, ]$rna_species
      pegrna_plots(
        sequence_name = rna_species,
        normalization_factor = norm_factor,
        ylab = sprintf("Coverage (normalized to %s)", norm_factor),
        vlines = seq(sample_list[i, ]$start, sample_list[i, ]$end, 2),
        start_offset = start_offset
      )
    }
  }
}

HEK3

Code
sample_list <- rna_mixes %>% filter(grepl("HEK3", rna_species))

for (start_offset in c(NA, 5)) {
  for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
    for (i in seq_len(nrow(sample_list))) {
      rna_species <- sample_list[i, ]$rna_species
      pegrna_plots(
        sequence_name = rna_species,
        normalization_factor = norm_factor,
        ylab = sprintf("Coverage (normalized to %s)", norm_factor),
        vlines = seq(sample_list[i, ]$start, sample_list[i, ]$end, 2),
        start_offset = start_offset
      )
    }
  }
}

DNMT1

Code
sample_list <- rna_mixes %>% filter(grepl("DNMT1", rna_species))

for (start_offset in c(NA, 5)) {
  for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
    for (i in seq_len(nrow(sample_list))) {
      rna_species <- sample_list[i, ]$rna_species
      pegrna_plots(
        sequence_name = rna_species,
        normalization_factor = norm_factor,
        ylab = sprintf("Coverage (normalized to %s)", norm_factor),
        vlines = seq(sample_list[i, ]$start, sample_list[i, ]$end, 2),
        start_offset = start_offset
      )
    }
  }
}

RUNX1

Code
sample_list <- rna_mixes %>% filter(grepl("RUNX1", rna_species))

for (start_offset in c(NA, 5)) {
  for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
    for (i in seq_len(nrow(sample_list))) {
      rna_species <- sample_list[i, ]$rna_species
      pegrna_plots(
        sequence_name = rna_species,
        normalization_factor = norm_factor,
        ylab = sprintf("Coverage (normalized to %s)", norm_factor),
        vlines = seq(sample_list[i, ]$start, sample_list[i, ]$end, 2),
        start_offset = start_offset
      )
    }
  }
}

EMX1

Code
sample_list <- rna_mixes %>% filter(grepl("EMX1", rna_species))

for (start_offset in c(NA, 5)) {
  for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
    for (i in seq_len(nrow(sample_list))) {
      rna_species <- sample_list[i, ]$rna_species
      pegrna_plots(
        sequence_name = rna_species,
        normalization_factor = norm_factor,
        ylab = sprintf("Coverage (normalized to %s)", norm_factor),
        vlines = seq(sample_list[i, ]$start, sample_list[i, ]$end, 2),
        start_offset = start_offset
      )
    }
  }
}

RNF2

Code
sample_list <- rna_mixes %>% filter(grepl("RNF2", rna_species))

for (start_offset in c(NA, 5)) {
  for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
    for (i in seq_len(nrow(sample_list))) {
      rna_species <- sample_list[i, ]$rna_species
      pegrna_plots(
        sequence_name = rna_species,
        normalization_factor = norm_factor,
        ylab = sprintf("Coverage (normalized to %s)", norm_factor),
        vlines = seq(sample_list[i, ]$start, sample_list[i, ]$end, 2),
        start_offset = start_offset
      )
    }
  }
}

Strandedness

Code
source("pegrna_alignment_strandedness.R")
for (rna_species in rna_mixes %>%
  select(rna_species) %>%
  unique() %>%
  pull()) {
  stranded_counts <- pegrna_alignment_strandedness(sequence_name = rna_species)
  p <- ggplot(
    data = stranded_counts,
    aes(x = sample_unit, y = count, fill = strand)
  ) +
    geom_bar(stat = "identity") +
    labs(title = rna_species) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
  print(p)
}